library(tidyverse) # Integration
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr) # Manipulation
library(ggplot2) # Visualization
library(cluster) # Clustering
library(lubridate) # Dates
library(caret) # Modeling
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(factoextra) # Visualization (specifically for factor analysis and clustering)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(arules) # Association
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Attaching package: 'arules'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz) # Visualization (specifically for association rules)
library(reshape2) # Reshaping
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(gridExtra) # Layout
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
# Fix the RNG state so any sampling/clustering later in the script is reproducible
set.seed(123) # for reproducible results
# Load the dataset
# NOTE(review): assumes "kz.csv" sits in the working directory -- an e-commerce
# order log with event_time, ID columns, category_code, brand, price. Confirm path.
dataset <- read.csv("kz.csv")
# Viewing the first few rows of the dataset
head(dataset)
## event_time order_id product_id category_id
## 1 2020-04-24 11:50:39 UTC 2.294360e+18 1.515966e+18 2.268105e+18
## 2 2020-04-24 11:50:39 UTC 2.294360e+18 1.515966e+18 2.268105e+18
## 3 2020-04-24 14:37:43 UTC 2.294444e+18 2.273948e+18 2.268105e+18
## 4 2020-04-24 14:37:43 UTC 2.294444e+18 2.273948e+18 2.268105e+18
## 5 2020-04-24 19:16:21 UTC 2.294584e+18 2.273948e+18 2.268105e+18
## 6 2020-04-26 08:45:57 UTC 2.295717e+18 1.515966e+18 2.268105e+18
## category_code brand price user_id column_name
## 1 electronics.tablet samsung 162.01 1.515916e+18 0
## 2 electronics.tablet samsung 162.01 1.515916e+18 0
## 3 electronics.audio.headphone huawei 77.52 1.515916e+18 0
## 4 electronics.audio.headphone huawei 77.52 1.515916e+18 0
## 5 karcher 217.57 1.515916e+18 0
## 6 furniture.kitchen.table maestro 39.33 1.515916e+18 0
# Basic structure of the dataset
# Column types: event_time is read as character; the 18-digit IDs are read as
# doubles (note: beyond 2^53 they lose integer precision -- fine for grouping
# by value, but not for arithmetic).
str(dataset)
## 'data.frame': 107159 obs. of 9 variables:
## $ event_time : chr "2020-04-24 11:50:39 UTC" "2020-04-24 11:50:39 UTC" "2020-04-24 14:37:43 UTC" "2020-04-24 14:37:43 UTC" ...
## $ order_id : num 2.29e+18 2.29e+18 2.29e+18 2.29e+18 2.29e+18 ...
## $ product_id : num 1.52e+18 1.52e+18 2.27e+18 2.27e+18 2.27e+18 ...
## $ category_id : num 2.27e+18 2.27e+18 2.27e+18 2.27e+18 2.27e+18 ...
## $ category_code: chr "electronics.tablet" "electronics.tablet" "electronics.audio.headphone" "electronics.audio.headphone" ...
## $ brand : chr "samsung" "samsung" "huawei" "huawei" ...
## $ price : num 162 162 77.5 77.5 217.6 ...
## $ user_id : num 1.52e+18 1.52e+18 1.52e+18 1.52e+18 1.52e+18 ...
## $ column_name : int 0 0 0 0 0 0 0 0 0 0 ...
# Summary statistics
# Per-column five-number summaries; column_name is constant 0 (dropped later).
summary(dataset)
## event_time order_id product_id category_id
## Length:107159 Min. :2.294e+18 Min. :1.516e+18 Min. :2.268e+18
## Class :character 1st Qu.:2.337e+18 1st Qu.:1.516e+18 1st Qu.:2.268e+18
## Mode :character Median :2.349e+18 Median :1.516e+18 Median :2.268e+18
## Mean :2.340e+18 Mean :1.686e+18 Mean :2.273e+18
## 3rd Qu.:2.349e+18 3rd Qu.:1.516e+18 3rd Qu.:2.268e+18
## Max. :2.353e+18 Max. :2.349e+18 Max. :2.374e+18
## category_code brand price user_id
## Length:107159 Length:107159 Min. : 0.00 Min. :1.516e+18
## Class :character Class :character 1st Qu.: 15.72 1st Qu.:1.516e+18
## Mode :character Mode :character Median : 50.21 Median :1.516e+18
## Mean : 150.61 Mean :1.516e+18
## 3rd Qu.: 175.90 3rd Qu.:1.516e+18
## Max. :18328.68 Max. :1.516e+18
## column_name
## Min. :0
## 1st Qu.:0
## Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
# Count values that are "missing" in any common representation: true NA,
# empty strings, and the literal placeholders "NA"/"NULL".
#
# Trimming whitespace first generalizes the previous single-space check
# (x == " ") to any whitespace-only string (e.g. "  ", "\t"), and %in%
# never yields NA, so the sum needs no extra NA handling.
#
# @param x A vector; coerced to character for the placeholder checks.
# @return Integer count of missing-like entries in x.
count_missing <- function(x) {
  sum(is.na(x) | trimws(as.character(x)) %in% c("", "NA", "NULL"))
}
# Count missing values again after replacement
# NOTE(review): no replacement has happened yet at this point in the script --
# this is the first missing-value count; the "again" wording looks stale.
# sapply is safe here because count_missing always returns a length-1 integer.
missing_values <- sapply(dataset, count_missing)
# Print the number of missing values per column
print(missing_values)
## event_time order_id product_id category_id category_code
## 0 0 0 0 28737
## brand price user_id column_name
## 5428 0 0 0
# Remove duplicate rows
# duplicated() keeps the first occurrence of each fully-identical row.
dataset <- dataset[!duplicated(dataset), ]
# Count and print the number of duplicate rows after removal
num_duplicate_rows <- sum(duplicated(dataset))
print(num_duplicate_rows)
## [1] 0
The category_code column has 28,737 missing values and the brand column has 5,428 missing values; the remaining columns have no missing values.
#Converting the event time to POSIXct for further analysis
# Convert event_time from Factor to character
# NOTE(review): with read.csv() under R >= 4.0 (stringsAsFactors = FALSE default)
# event_time is already character, so this is a harmless no-op kept for safety.
dataset$event_time <- as.character(dataset$event_time)
# Then convert from character to POSIXct with the proper format and timezone
# Replace the format string with the actual format of our event_time if it's different
# The trailing " UTC" in the format is matched literally; rows that do not match
# this exact pattern parse to NA (filtered out later with !is.na(event_time)).
dataset$event_time <- as.POSIXct(dataset$event_time, format = "%Y-%m-%d %H:%M:%S UTC", tz = "UTC")
#Replacing brand and category code to Generic to assume the BRAND and Category of the product to be “non branded” or possible variety of products in “E-commerce websites”
# Convert 'brand' and 'category_code' to character type
dataset$brand <- as.character(dataset$brand)
dataset$category_code <- as.character(dataset$category_code)
# Replacing NA and empty values in 'brand' and 'category_code' with 'Generic'
# nzchar() is FALSE only for "" -- whitespace-only strings are NOT caught here.
dataset$brand[is.na(dataset$brand) | !nzchar(dataset$brand)] <- "Generic"
dataset$category_code[is.na(dataset$category_code) | !nzchar(dataset$category_code)] <- "Generic"
# Dropping the 'column_name' variable
# column_name is constant 0 (see summary above), so it carries no information.
dataset <- dataset[ , !(names(dataset) %in% c("column_name"))]
# Viewing the first few rows to verify changes
head(dataset)
## event_time order_id product_id category_id
## 1 2020-04-24 11:50:39 2.294360e+18 1.515966e+18 2.268105e+18
## 3 2020-04-24 14:37:43 2.294444e+18 2.273948e+18 2.268105e+18
## 5 2020-04-24 19:16:21 2.294584e+18 2.273948e+18 2.268105e+18
## 6 2020-04-26 08:45:57 2.295717e+18 1.515966e+18 2.268105e+18
## 7 2020-04-26 09:33:47 2.295741e+18 1.515966e+18 2.268105e+18
## 11 2020-04-26 14:55:26 2.295902e+18 2.273948e+18 2.268105e+18
## category_code brand price user_id
## 1 electronics.tablet samsung 162.01 1.515916e+18
## 3 electronics.audio.headphone huawei 77.52 1.515916e+18
## 5 Generic karcher 217.57 1.515916e+18
## 6 furniture.kitchen.table maestro 39.33 1.515916e+18
## 7 electronics.smartphone apple 1387.01 1.515916e+18
## 11 appliances.kitchen.refrigerators lg 462.94 1.515916e+18
# Bar plot of the 30 most frequent values of a categorical column.
#
# @param column  Name of the column to plot (a string).
# @param dataset Data frame containing that column.
# Numeric columns are skipped; otherwise the plot is printed as a side effect.
create_bar_plot <- function(column, dataset) {
  if (!is.numeric(dataset[[column]])) {
    # Drop missing values before counting; the .data pronoun resolves the
    # column name given as a string (tidy-eval replacement for the magrittr dot)
    dataset <- dataset %>% filter(!is.na(.data[[column]]))
    # Keep the 30 most frequent values
    top_data <- dataset %>%
      count(!!sym(column)) %>%
      arrange(desc(n)) %>%
      head(30)
    # aes(.data[[column]]) replaces aes_string(), deprecated since ggplot2 3.0;
    # geom_col() is the idiomatic form of geom_bar(stat = "identity")
    p <- ggplot(top_data, aes(x = .data[[column]], y = n)) +
      geom_col(fill = "orange") +
      ggtitle(paste('Top 30 Bar Plot of', column)) +
      xlab(column) +
      ylab("Count") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    print(p)
  }
}
# Apply the function to 'brand' and 'category_code'
for (column in c("brand", "category_code")) {
  create_bar_plot(column, dataset)
}
The above bar plots represent the frequency counts of two different
categorical variables: brand and category_code. The bar plot of brand shows
the top 30 brands based on their frequency count; each bar represents a
unique brand and its number of occurrences. The bar plot of category_code
visualizes the top 30 category codes; similar to the first plot, each
bar represents a category code and shows how many times that category
appears.
# Fine-grained histogram of the raw 'price' variable (1000 bins)
ggplot(dataset, aes(x = price)) +
  geom_histogram(bins = 1000, colour = "blue", fill = "blue") +
  labs(title = "Distribution of Price", x = "Price", y = "Count") +
  theme_minimal()
# Histogram for Numeric Variables: understand the distribution of `price`.
# Zero prices are treated as missing and imputed with the mean of the
# positive prices so they do not distort the distribution.
dataset$price[dataset$price == 0] <- mean(dataset$price[dataset$price > 0], na.rm = TRUE) # Replace with mean
# Recompute mean and median after replacing the zeros
mean_price <- mean(dataset$price, na.rm = TRUE)
median_price <- median(dataset$price, na.rm = TRUE)
# Histogram on the original (linear) scale, mean marked with a dashed line
histogram_original <- ggplot(dataset, aes(x = price)) +
  geom_histogram(binwidth = 500, fill = "skyblue", color = "darkblue") +
  # `linewidth` replaces the `size` aesthetic deprecated for lines in ggplot2 3.4.0
  geom_vline(xintercept = mean_price, color = "red", linetype = "dashed", linewidth = 1) +
  theme_minimal(base_size = 8) +
  labs(title = "Distribution of Price",
       subtitle = paste("Mean price:", round(mean_price, 2)),
       x = "Price",
       y = "Count") +
  theme(plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12))
# Histogram on a log10 x-axis; bins = 30 states the previous implicit default
# explicitly, which also silences the stat_bin() "pick better binwidth" message
histogram_log <- ggplot(dataset, aes(x = price)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "darkblue") +
  geom_vline(xintercept = mean_price, color = "red", linetype = "dashed", linewidth = 1) +
  scale_x_log10() + # Apply log scale to the x-axis
  theme_minimal(base_size = 8) +
  labs(title = "Log Distribution of Price",
       subtitle = paste("Mean price:", round(mean_price, 2)),
       x = "Log(Price)",
       y = "Count") +
  theme(plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12))
# Combine the two histograms side by side
combined_histograms <- grid.arrange(histogram_original, histogram_log, ncol = 2, heights = c(1, 1))
# Save the combined histograms to a file with a larger size
ggsave("combined_histograms.png", combined_histograms, width = 12, height = 6, dpi = 300)
Replacing Zero Prices: It first replaces zero values in the price column with the mean of non-zero prices. Recomputing Mean and Median: After this replacement, it calculates the new mean and median prices. Histogram Creation (Original and Log Scale): Two histograms are created to visualize the distribution of prices. The first histogram is on the original scale, and the second is on a logarithmic scale. Both histograms highlight the mean price with a dashed red line. Combining Histograms: These histograms are then combined side by side for comparative visualization. Saving the Combined Histograms: Finally, the combined histograms are saved as a high-resolution image (combined_histograms.png).
# Recompute mean and median price (unchanged since the imputation above, but
# recomputed so this chunk stands on its own)
mean_price <- mean(dataset$price, na.rm = TRUE)
median_price <- median(dataset$price, na.rm = TRUE)
# Linear-scale histogram of price with the mean marked by a dashed red line
ggplot(dataset, aes(x = price)) +
  geom_histogram(binwidth = 500, fill = "skyblue", color = "darkblue") +
  # `linewidth` replaces the `size` aesthetic deprecated for lines in ggplot2 3.4.0
  geom_vline(xintercept = mean_price, color = "red", linetype = "dashed", linewidth = 1) +
  theme_minimal(base_size = 8) +
  labs(title = "Distribution of Price",
       subtitle = paste("Mean price:", round(mean_price, 2)),
       x = "Price",
       y = "Count") +
  theme(plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12))
The histogram above shows the distribution of prices in our dataset. It seems like most of the prices are clustered in the lower range, indicating a right-skewed distribution. This is typical in retail datasets where a large number of low-cost items are sold compared to a few high-priced items.
# Aggregate order counts per brand and keep the ten most frequent brands
top_brands <- dataset %>%
  group_by(brand) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10)
# Bar chart of the top 10 brands, ordered by frequency; labels rotated 90 degrees
ggplot(top_brands, aes(x = reorder(brand, Count), y = Count)) +
  geom_col(fill = "skyblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Frequency of Top 10 Brands",
       x = "Brand",
       y = "Count")
Bar Plot for Categorical Variables: to visualize the frequency of categories in variables like brand. The distribution depicted by the bars shows a right-skewed tendency, with most brands having a lower frequency count and a minority having much higher counts. This type of skewness is common in market statistics, where a few brands dominate presence or sales, overshadowing a larger number of less prevalent brands.
# Count rows per category and retain the ten largest categories
top_categories <- dataset %>%
  group_by(category_code) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10)
# Bar chart of the 10 most frequent categories, ordered by count
ggplot(top_categories, aes(x = reorder(category_code, Count), y = Count)) +
  geom_col(fill = "skyblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Frequency of Top 10 Categories",
       x = "Category",
       y = "Count")
The bar plot displays the distribution of categories in our dataset, revealing a concentration of frequencies among specific categories, indicating a right-skewed distribution. This pattern is frequently observed in consumer data, reflecting a market in which the majority of transactions are concentrated in a few popular categories, with the rest having far fewer occurrences. Analogous to examining price distributions in numerical data, the visualization aids in understanding the distribution of categorical variables such as category_code. Notably, “electronics.smartphone” has a relatively high count.
# Summary statistics for numerical variables
# NOTE(review): the 1970-01-01 minimum on event_time suggests a few rows carry
# epoch-era or malformed timestamps -- worth inspecting before time-series work.
summary(dataset)
## event_time order_id product_id
## Min. :1970-01-01 00:33:40.00 Min. :2.294e+18 Min. :1.516e+18
## 1st Qu.:2020-02-16 14:01:31.25 1st Qu.:2.337e+18 1st Qu.:1.516e+18
## Median :2020-04-06 06:04:24.50 Median :2.349e+18 Median :1.516e+18
## Mean :2019-09-01 13:23:53.93 Mean :2.340e+18 Mean :1.685e+18
## 3rd Qu.:2020-06-04 12:16:23.00 3rd Qu.:2.349e+18 3rd Qu.:1.516e+18
## Max. :2020-07-14 16:54:28.00 Max. :2.353e+18 Max. :2.349e+18
## category_id category_code brand price
## Min. :2.268e+18 Length:106612 Length:106612 Min. : 0.02
## 1st Qu.:2.268e+18 Class :character Class :character 1st Qu.: 16.18
## Median :2.268e+18 Mode :character Mode :character Median : 50.90
## Mean :2.273e+18 Mean : 150.83
## 3rd Qu.:2.268e+18 3rd Qu.: 178.22
## Max. :2.374e+18 Max. :18328.68
## user_id
## Min. :1.516e+18
## 1st Qu.:1.516e+18
## Median :1.516e+18
## Mean :1.516e+18
## 3rd Qu.:1.516e+18
## Max. :1.516e+18
# Correlation matrix for numerical variables
# Selects the numeric columns only; the ID columns are arbitrary identifiers,
# so their correlations (except perhaps order_id vs user_id) are not meaningful.
correlations <- cor(dataset[, sapply(dataset, is.numeric)])
print(correlations)
## order_id product_id category_id price user_id
## order_id 1.00000000 0.03976368 -0.01914090 -0.04345975 0.47937732
## product_id 0.03976368 1.00000000 -0.02767009 -0.08000943 0.02347187
## category_id -0.01914090 -0.02767009 1.00000000 0.16342478 -0.01149384
## price -0.04345975 -0.08000943 0.16342478 1.00000000 -0.03682971
## user_id 0.47937732 0.02347187 -0.01149384 -0.03682971 1.00000000
Insights from Summary Statistics event_time: This column is character type and shows timestamps. We convert it to a Date or DateTime format for time series analysis or to extract features like day of week or hour of day. Numerical IDs (order_id, product_id, category_id, user_id): These seem to be identifiers. Their statistical summary might not be very informative for analysis, but these fields could be important for identifying unique transactions, products, categories, and users. category_code: Contains a significant number of missing values (28,737). The non-missing values provide product categories which could be useful for segmentation or grouping analyses. brand: Also has missing values (5,428) and is a key variable for brand-wise analysis. price: Shows a wide range (0 to 18,328.68). Understanding its distribution and relationship with other variables like brand or category_code could be insightful. column_name: Seems to have a constant value (0). If this column does not vary, it might not be useful for analysis.
Correlation Matrix The correlation matrix shows low correlation coefficients among most variables, indicating weak linear relationships. user_id has a moderate positive correlation with order_id, which might suggest some pattern or relationship between users and order IDs. order_id and product_id (0.03976368): This shows a very weak positive correlation. It suggests that there’s barely any linear relationship between the order ID and the product ID. order_id and category_id (-0.01914090): This is a very weak negative correlation, indicating almost no meaningful linear relationship between order ID and category ID. order_id and price (-0.04345975): Again, a very weak negative correlation, suggesting that there’s no significant linear relationship between the order ID and the price of the product. order_id and user_id (0.47937732): This shows a moderate positive correlation. It suggests that there might be some linear relationship between the order ID and the user ID, possibly indicating that certain users are linked to specific orders. product_id and category_id (-0.02767009): A very weak negative correlation, indicating almost no linear relationship between product ID and category ID. product_id and price (-0.08000943): A weak negative correlation, suggesting a slight tendency for different products (by ID) to have different prices, but the relationship is not strong. product_id and user_id (0.02347187): A very weak positive correlation, indicating almost no linear relationship between product ID and user ID. category_id and price (0.16342478): A weak positive correlation, suggesting that there might be a slight tendency for different categories to have different price ranges. category_id and user_id (-0.01149384): A very weak negative correlation, indicating almost no linear relationship between category ID and user ID.
price and user_id (-0.03682971): A very weak negative correlation, suggesting almost no linear relationship between the price of products and user ID. column_name: This constant, uninformative column was dropped during data cleaning, so it no longer appears in the correlation matrix; had it been retained, its correlations with the other variables would all be NA, which is typical for a constant column.
# Converting 'event_time' to DateTime format
# NOTE(review): event_time is already POSIXct from the earlier conversion, so
# this call is a no-op (as.POSIXct returns POSIXct input unchanged; the format
# argument is ignored). Harmless, but redundant.
dataset$event_time <- as.POSIXct(dataset$event_time, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
# Checking the structure and summary again
str(dataset)
## 'data.frame': 106612 obs. of 8 variables:
## $ event_time : POSIXct, format: "2020-04-24 11:50:39" "2020-04-24 14:37:43" ...
## $ order_id : num 2.29e+18 2.29e+18 2.29e+18 2.30e+18 2.30e+18 ...
## $ product_id : num 1.52e+18 2.27e+18 2.27e+18 1.52e+18 1.52e+18 ...
## $ category_id : num 2.27e+18 2.27e+18 2.27e+18 2.27e+18 2.27e+18 ...
## $ category_code: chr "electronics.tablet" "electronics.audio.headphone" "Generic" "furniture.kitchen.table" ...
## $ brand : chr "samsung" "huawei" "karcher" "maestro" ...
## $ price : num 162 77.5 217.6 39.3 1387 ...
## $ user_id : num 1.52e+18 1.52e+18 1.52e+18 1.52e+18 1.52e+18 ...
summary(dataset)
## event_time order_id product_id
## Min. :1970-01-01 00:33:40.00 Min. :2.294e+18 Min. :1.516e+18
## 1st Qu.:2020-02-16 14:01:31.25 1st Qu.:2.337e+18 1st Qu.:1.516e+18
## Median :2020-04-06 06:04:24.50 Median :2.349e+18 Median :1.516e+18
## Mean :2019-09-01 13:23:53.93 Mean :2.340e+18 Mean :1.685e+18
## 3rd Qu.:2020-06-04 12:16:23.00 3rd Qu.:2.349e+18 3rd Qu.:1.516e+18
## Max. :2020-07-14 16:54:28.00 Max. :2.353e+18 Max. :2.349e+18
## category_id category_code brand price
## Min. :2.268e+18 Length:106612 Length:106612 Min. : 0.02
## 1st Qu.:2.268e+18 Class :character Class :character 1st Qu.: 16.18
## Median :2.268e+18 Mode :character Mode :character Median : 50.90
## Mean :2.273e+18 Mean : 150.83
## 3rd Qu.:2.268e+18 3rd Qu.: 178.22
## Max. :2.374e+18 Max. :18328.68
## user_id
## Min. :1.516e+18
## 1st Qu.:1.516e+18
## Median :1.516e+18
## Mean :1.516e+18
## 3rd Qu.:1.516e+18
## Max. :1.516e+18
# Now lets get back to checking duplicate rows and missing values, etc.
# Check for duplicate rows
dupe_rows <- dataset[duplicated(dataset), ]
if(nrow(dupe_rows) > 0) {
print("Duplicate rows detected!")
print(nrow(dupe_rows))
} else {
print("No duplicate rows found.")
}
## [1] "No duplicate rows found."
# Filter invalid rows and filter out rows with price less than or equal to zero.
# NOTE(review): zero prices were already imputed with the mean in an earlier
# chunk, so the price filter is effectively a no-op here; !is.na(event_time)
# drops rows whose timestamps failed to parse (none remain: 106612 rows kept).
dataset <- dataset %>%
filter(!is.na(event_time)) %>%
filter(price > 0)
nrow(dataset) #Number of rows after deleting the rows with duplicates
## [1] 106612
# Aggregate
# Order-line counts per brand (n() counts rows, i.e. line items, not
# distinct order IDs despite the column name -- NOTE(review): verify intent).
brand_orders <- dataset %>%
group_by(brand) %>%
summarize(num_orders = n())
print(brand_orders)
## # A tibble: 649 × 2
## brand num_orders
## <chr> <int>
## 1 Generic 5397
## 2 a-case 120
## 3 acana 3
## 4 accesstyle 9
## 5 activision 9
## 6 adidas 2
## 7 advantek 75
## 8 aeg 11
## 9 aerocool 96
## 10 aimoto 34
## # ℹ 639 more rows
#Mean price for each category
category_price <- dataset %>%
group_by(category_code) %>%
summarize(mean_price = mean(price))
print(category_price)
## # A tibble: 121 × 2
## category_code mean_price
## <chr> <dbl>
## 1 Generic 58.5
## 2 accessories.bag 20.6
## 3 accessories.umbrella 113.
## 4 apparel.costume 15.0
## 5 apparel.glove 382.
## 6 apparel.shirt 31.9
## 7 apparel.sock 114.
## 8 apparel.trousers 27.8
## 9 apparel.tshirt 21.5
## 10 appliances.environment.air_conditioner 238.
## # ℹ 111 more rows
# Keep only 2020 events and derive a calendar date column in one pipeline
dataset <- dataset %>%
  filter(event_time > "2020-01-01") %>%
  mutate(date = as.Date(event_time))
# Number of order lines per calendar day
orders_per_day <- dataset %>%
  count(date)
# Daily order counts drawn as a blue line chart
ggplot(orders_per_day, aes(x = date, y = n)) +
  geom_line(colour = "blue") +
  labs(title = "Orders per Day",
       x = "Year 2020",
       y = "Number of Orders") +
  theme_minimal()
Time Series Analysis of Daily Orders in 2020
The “Orders per Day” graph illustrates a time series of daily order counts for the year 2020. Observations from the graph include:
Cyclical Fluctuations: A recurring pattern of peaks and troughs suggests a correlation with pay cycles (“ups”) and month ends (“downs”). Trend Over Time: Starting April, there is a noticeable upward trend, indicating an overall increase in daily orders as the year progresses. Significant Peaks: Exceptionally high peaks, particularly one in January and a substantial rise in July, may point to special events or promotions influencing order volumes. Stabilization and Surge: After an initial period of high variability, the order count stabilizes from April to June, followed by a sharp increase in July. COVID-19 Considerations: The year 2020’s unique context of the COVID-19 pandemic likely impacted consumer behavior, potentially explaining the surge in online orders due to restrictions and lifestyle changes.
# Distribution of events over time, with bars shaded by bin count.
# after_stat(count) replaces the `..count..` notation deprecated in ggplot2
# 3.4.0; bins = 30 states the previous implicit default explicitly, which
# also silences the stat_bin() "pick better binwidth" message.
ggplot(dataset, aes(x = event_time)) +
  geom_histogram(aes(fill = after_stat(count)), bins = 30) +
  scale_fill_gradient(low = "skyblue", high = "blue") +
  labs(title = "Event Time Distribution", x = "Event Time", y = "Count") +
  theme_minimal()
# The 30 brands with the most order lines (slice_max keeps ties by default,
# so a tie at rank 30 may yield a few extra rows)
top_brands <- dataset %>%
  count(brand) %>%
  slice_max(order_by = n, n = 30)
# Orange bar chart with brand labels rotated vertically for readability
ggplot(top_brands, aes(x = brand, y = n)) +
  geom_col(fill = "orange") +
  labs(title = "Top 30 Brands by Order Count",
       x = "Brand",
       y = "Order Count") +
  theme(axis.text.x = element_text(angle = 90))
# Converting event_time to POSIXct format and extracting time-based features
# NOTE(review): event_time is already POSIXct here, so this conversion is a
# no-op (format is ignored for POSIXct input). The format() calls below return
# character vectors -- convert to numeric/factor before using them in models.
dataset$event_time <- as.POSIXct(dataset$event_time, format="%Y-%m-%d %H:%M:%S UTC", tz="UTC")
dataset$year <- format(dataset$event_time, "%Y")
dataset$month <- format(dataset$event_time, "%m")
dataset$day_of_week <- format(dataset$event_time, "%u") # 1 = Monday, ..., 7 = Sunday
dataset$hour_of_day <- format(dataset$event_time, "%H")
This chunk extracts the year, month, day of the week, and hour of the day from the event_time. These time-based features can be very useful as they allow us to capture patterns based on time (like seasonality, weekday vs. weekend behavior, etc.)
# Grouping Users into segments
# Per-user purchase history: distinct-order count, mean item price,
# and timestamp of the most recent purchase.
purchase_history <- dataset %>%
  group_by(user_id) %>%
  summarise(
    total_purchases        = n_distinct(order_id),
    average_purchase_value = mean(price, na.rm = TRUE),
    recent_purchase        = max(event_time)
  )

# Recency in days, measured against the most recent event in the whole dataset.
latest_event <- max(dataset$event_time)
purchase_history$days_since_last_purchase <-
  as.numeric(difftime(latest_event, purchase_history$recent_purchase, units = "days"))

# Attach the per-user history back onto every transaction row
# (merge() also sorts rows by user_id, which later code relies on seeing).
dataset <- merge(dataset, purchase_history, by = "user_id")
This chunk uses dplyr to group the data by user_id and then calculates several metrics: the total number of unique purchases (total_purchases), the average purchase value (average_purchase_value), and the time of the most recent purchase (recent_purchase). It then calculates the number of days since the last purchase for each user. This is done by finding the difference in days between the most recent purchase of each user and the most recent purchase in the entire dataset. Finally, this information is merged back into the main dataset, enriching it with user-specific historical data, which is often crucial for predictive models in e-commerce contexts.
This chunk removes columns that are no longer needed for modeling. These include the original event_time, order_id, product_id, category_id, category_code, and brand columns. The reason for dropping these columns is that they are either redundant (we have extracted all useful information from them) or not useful for prediction (like IDs that don’t have predictive power).
# Frequency encoding for 'category_code': each row gets the share of the
# dataset its category represents; rows with NA category encode as 0.
category_freq <- prop.table(table(factor(dataset$category_code, levels = unique(dataset$category_code))))
# Vectorized named-table lookup replaces the original per-row sapply()
# (same values, one indexing operation instead of nrow(dataset) calls).
dataset$category_code_freq <- as.numeric(category_freq[as.character(dataset$category_code)])
dataset$category_code_freq[is.na(dataset$category_code_freq)] <- 0
# Frequency encoding for 'brand', mirroring the category encoding:
# each row gets its brand's share of the dataset; NA brands encode as 0.
brand_freq <- prop.table(table(factor(dataset$brand, levels = unique(dataset$brand))))
# Vectorized named-table lookup replaces the original per-row sapply().
dataset$brand_freq <- as.numeric(brand_freq[as.character(dataset$brand)])
dataset$brand_freq[is.na(dataset$brand_freq)] <- 0
This chunk applies frequency encoding to the category_code and brand columns. It calculates the frequency of each category and brand in the dataset. It then replaces each category and brand in the original dataset with its corresponding frequency. This helps in handling categorical variables with many unique values and can provide a meaningful way to include this information in the model.
First, the code ensures that event_time is converted to a Date format. This is important for accurate date comparisons. Then, it defines cutoff_date as the date one month before the latest date in the dataset. The ifelse statement creates the will_purchase variable, where 1 indicates a purchase after the cutoff date (i.e., within the next month), and 0 otherwise. Lastly, it drops the event_time column as it’s no longer needed after creating the target variable.
# Calculating the total number of unique users
total_unique_users <- n_distinct(dataset$user_id)
# Data Aggregation at User Level
# NOTE(review): number_of_orders here counts DISTINCT order ids; the later
# clustering aggregation reuses the same column name but counts rows via n()
# — confirm which definition is intended and make them consistent.
user_data <- dataset %>%
group_by(user_id) %>%
summarise(
number_of_orders = n_distinct(order_id),
total_spending = sum(price, na.rm = TRUE)
)
# Viewing the total number of unique users
print(paste("Total number of unique users:", total_unique_users))
## [1] "Total number of unique users: 26864"
# Viewing the first few rows of the aggregated user data
head(user_data)
## # A tibble: 6 × 3
## user_id number_of_orders total_spending
## <dbl> <int> <dbl>
## 1 1.52e18 1 417.
## 2 1.52e18 1 625.
## 3 1.52e18 2 183.
## 4 1.52e18 1 856.
## 5 1.52e18 3 197.
## 6 1.52e18 10 3453.
# Data Aggregation at User Level
# NOTE(review): this overwrites the user_data built just above; here
# number_of_orders = n() counts line items (rows), not distinct order ids
# as in the previous chunk — confirm that is intentional.
user_data <- dataset %>%
group_by(user_id) %>%
summarise(
total_spending = sum(price, na.rm = TRUE),
average_spending_per_order = mean(price, na.rm = TRUE),
number_of_orders = n()
)
# Scaling the data
# z-score standardization so each metric contributes equally to distances
user_data_scaled <- scale(user_data[,-1]) # Excluding user_id for scaling
# Applying K-Means Clustering
# k = 4 is chosen a priori; an elbow/silhouette analysis could justify it.
# nstart = 25 runs 25 random initializations and keeps the best solution.
set.seed(123) # Set seed for reproducibility
kmeans_result <- kmeans(user_data_scaled, centers = 4, nstart = 25) # Change 'centers' based on our requirement
# Adding cluster information to the user data
user_data$cluster <- kmeans_result$cluster
# Viewing the first few rows of the clustered data
head(user_data)
## # A tibble: 6 × 5
## user_id total_spending average_spending_per_order number_of_orders cluster
## <dbl> <dbl> <dbl> <int> <int>
## 1 1.52e18 417. 417. 1 2
## 2 1.52e18 625. 625. 1 2
## 3 1.52e18 183. 91.4 2 4
## 4 1.52e18 856. 856. 1 1
## 5 1.52e18 197. 65.7 3 4
## 6 1.52e18 3453. 182. 19 2
# Profile each k-means segment: mean of every clustering metric per cluster.
cluster_summary <- user_data %>%
  group_by(cluster) %>%
  summarize(
    average_total_spending     = mean(total_spending),
    average_spending_per_order = mean(average_spending_per_order),
    average_number_of_orders   = mean(number_of_orders)
  )

# Display the per-cluster profile table
print(cluster_summary)
## # A tibble: 4 × 4
## cluster average_total_spending average_spending_per_o…¹ average_number_of_or…²
## <int> <dbl> <dbl> <dbl>
## 1 1 1494. 985. 1.56
## 2 2 948. 346. 3.38
## 3 3 33729. 153. 253.
## 4 4 180. 73.5 2.29
## # ℹ abbreviated names: ¹average_spending_per_order, ²average_number_of_orders
# Scatterplots of the k-means segments: total spending against two
# different metrics, colored by cluster. A small helper avoids repeating
# the plot specification twice.
plot_cluster_scatter <- function(metric, metric_label) {
  ggplot(user_data, aes(x = {{ metric }}, y = total_spending, color = as.factor(cluster))) +
    geom_point() +
    labs(title = "User Clusters", x = metric_label, y = "Total Spending", color = "Cluster") +
    theme_minimal()
}
plot_cluster_scatter(number_of_orders, "number of orders")
plot_cluster_scatter(average_spending_per_order, "average_spending_per_order")
Cluster Density and Spread: The density of the points (how close they are to each other) and their spread (how far they stretch on the graph) give an idea of the variance within each cluster. A high density indicates that users within a cluster have similar spending behaviors. Cluster Centers: Ideally, we would also plot the centers of each cluster to see where the “average” user in each cluster lies on the graph. These are not shown but can be added to the plot for better interpretation. Cluster Characteristics: Cluster 1 (Red): These users seem to have a lower average spending per order and lower total spending. This cluster could represent “Occasional Shoppers” or “Budget-Conscious” shoppers. Cluster 2 (Green): Users in this cluster have a wide range of average spending but generally lower total spending. They might be “Selective Shoppers” who purchase infrequently but spend varying amounts when they do. Cluster 3 (Blue): This cluster has few users with very high total spending and also high average spending per order. These could be “Premium Shoppers” or “High-Value Customers.” Outliers: There are some points that stand far away from others, especially in Cluster 3. These outliers can significantly affect the average values of the cluster and may need further investigation or different handling in the analysis.
# Ensure user_data has the cluster assignments
# NOTE(review): user_data$cluster was already assigned right after kmeans,
# so this mutate() is a redundant (harmless) re-assignment.
user_data <- user_data %>%
mutate(cluster = kmeans_result$cluster)
# First, add the cluster assignments to the original dataset
# inner_join brings the per-user metrics (total_spending, number_of_orders,
# average_spending_per_order) and the cluster label onto every transaction row.
dataset_with_clusters <- dataset %>%
inner_join(user_data, by = "user_id")
# Now we can group by cluster as well as other attributes
# .groups = "drop" returns an ungrouped tibble, avoiding the summarise message.
cluster_attributes <- dataset_with_clusters %>%
group_by(cluster, category_code, brand) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(desc(count))
# Printing the most common brands and categories per cluster
print(cluster_attributes)
## # A tibble: 3,527 × 4
## cluster category_code brand count
## <int> <chr> <chr> <int>
## 1 4 electronics.smartphone samsung 2886
## 2 2 electronics.smartphone samsung 2242
## 3 3 electronics.smartphone samsung 1472
## 4 4 Generic Generic 1143
## 5 3 Generic Generic 1116
## 6 2 electronics.smartphone apple 759
## 7 4 Generic samsung 665
## 8 3 Generic samsung 627
## 9 3 electronics.smartphone huawei 622
## 10 4 electronics.smartphone huawei 606
## # ℹ 3,517 more rows
# Keep the 3 highest-count (category, brand) rows within each cluster
n_top <- 3
top_categories <- cluster_attributes %>%
  group_by(cluster) %>%
  slice_max(order_by = count, n = n_top) %>%
  ungroup()

# Faceted bar chart: top 3 categories per cluster, bars ordered by count
ggplot(top_categories, aes(x = reorder(category_code, count), y = count, fill = as.factor(cluster))) +
  geom_col() +
  facet_wrap(~cluster, scales = "free_y") +
  labs(title = "Top 3 Categories per Cluster", x = "Category", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Cluster 1 (Red): The ‘electronics.smartphone’ category has the highest count, indicating that users in this cluster are particularly interested in smartphones. The ‘computers.notebook’ and ‘appliances.environment.vacuum’ categories also appear, suggesting an interest in computing and home appliances. Cluster 2 (Green): This cluster has a relatively balanced distribution among the top categories, with ‘computers.notebook’ leading. It’s followed by ‘electronics.smartphone’ and ‘appliances.environment.vacuum’, similar to Cluster 1, but with significantly lower counts. Cluster 3 (Blue): The ‘appliances.environment.vacuum’ and ‘electronics.smartphone’ categories are the most common, with a similar count. This suggests that users in Cluster 3 are interested in both home appliances and electronics, particularly smartphones. ‘Generic’ Category: All clusters include purchases in the ‘Generic’ category, which are likely items with unspecified or missing category information. The presence of this category across all clusters indicates a data classification issue that could be investigated further. Cluster Comparisons: Cluster 1 has the highest counts in popular categories, suggesting they may be the most active or largest cluster. Cluster 2 has lower activity levels across these top categories. Cluster 3, while similar to Cluster 1 in terms of category interests, seems to have fewer purchases in each category, or it could indicate a smaller size of the cluster.
# Rank every user by order count (line items), breaking ties by total spend.
user_analysis <- dataset %>%
  group_by(user_id) %>%
  summarize(number_of_orders = n(), total_spent = sum(price)) %>%
  arrange(desc(number_of_orders), desc(total_spent))

# The 50 most active users by the ranking above
top_50_users <- user_analysis %>%
  slice_head(n = 50)

# Display the top 50 users
top_50_users
## # A tibble: 50 × 3
## user_id number_of_orders total_spent
## <dbl> <int> <dbl>
## 1 1.52e18 704 83492.
## 2 1.52e18 583 62428.
## 3 1.52e18 553 78593.
## 4 1.52e18 538 61816.
## 5 1.52e18 520 64514.
## 6 1.52e18 500 53265.
## 7 1.52e18 478 65939.
## 8 1.52e18 468 60756.
## 9 1.52e18 465 50231.
## 10 1.52e18 455 57528.
## # ℹ 40 more rows
# Horizontal bar chart of total spend per top-50 user, ordered by spend.
ggplot(top_50_users, aes(x = reorder(user_id, total_spent), y = total_spent)) +
  geom_col(fill = "skyblue") +
  coord_flip() +  # horizontal bars keep 50 user-id labels readable
  theme_minimal() +
  labs(
    title = "Top 50 Users by Total Amount Spent",
    x = "User ID",
    y = "Total Amount Spent"
  )
# Merging the top 50 users with the original dataset
# semi_join filters dataset rows to the top-50 users without adding columns.
top_50_users_data <- dataset %>%
semi_join(top_50_users, by = "user_id")
# Aggregating the total spending in each category for the top 50 users
category_spending_top_50 <- top_50_users_data %>%
group_by(category_code) %>%
summarize(total_spent = sum(price)) %>%
arrange(desc(total_spent))
# Viewing the spending by category
category_spending_top_50
## # A tibble: 96 × 2
## category_code total_spent
## <chr> <dbl>
## 1 electronics.smartphone 498120.
## 2 Generic 300639.
## 3 computers.notebook 236339.
## 4 electronics.video.tv 203717.
## 5 appliances.kitchen.refrigerators 176251.
## 6 appliances.kitchen.washer 115786.
## 7 appliances.environment.vacuum 71498.
## 8 appliances.kitchen.hood 69594.
## 9 appliances.kitchen.oven 54055.
## 10 electronics.clocks 50362.
## # ℹ 86 more rows
This chunk shows the product categories purchased by the top 50 users, ranked by total spending.
# Selecting the top 10 users
top_10_users <- head(user_analysis, 10)
# Merging the top 10 users with the original dataset
top_10_users_data <- dataset %>%
  semi_join(top_10_users, by = "user_id")
# Analyzing and plotting for each of the top 10 users.
# NOTE(review): the original loop re-used the name `user_data`, silently
# clobbering the cluster-assignment table built earlier; a loop-local name
# (user_cat_spending) keeps that table intact.
for (user in top_10_users$user_id) {
  user_cat_spending <- top_10_users_data %>%
    filter(user_id == user) %>%
    group_by(category_code) %>%
    summarize(total_spent = sum(price)) %>%
    arrange(desc(total_spent)) %>%
    head(20) # Top 20 categories by spend for this user
  # Creating a bar plot for each user
  p <- ggplot(user_cat_spending, aes(x = reorder(category_code, total_spent), y = total_spent)) +
    geom_bar(stat = "identity", fill = "skyblue") +
    theme_minimal() +
    labs(title = paste("Top 20 Categories for User", user),
         x = "Category",
         y = "Total Spent") +
    coord_flip()
  print(p) # Explicit print needed inside a loop for ggplot objects
}
dataset_with_clusters %>%
group_by(category_code) %>%
summarise(Count = n()) %>%
top_n(30, Count) %>%
ggplot(aes(x = reorder(category_code, Count), y = Count, fill = Count)) +
geom_bar(stat = "identity") +
scale_fill_gradient(low = "blue", high = "red") + # Gradient color
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Frequency of Top 30 Product Categories")
# Check if 'cluster' column exists and inspect its first few values
# NOTE(review): this inspects `dataset`, which never received a cluster
# column (hence the NULL below); the clusters live on dataset_with_clusters,
# so head(dataset_with_clusters$cluster) is presumably what was intended.
head(dataset$cluster)
## NULL
# If 'cluster' is a factor, convert it to character
dataset_with_clusters$cluster <- as.character(dataset_with_clusters$cluster)
# Now, filter out the smallest cluster
# NOTE(review): despite "filter out", this KEEPS cluster '1' only; whether
# cluster 1 is truly the smallest should be confirmed against the counts.
smallest_cluster <- dataset_with_clusters %>%
filter(cluster == '1')
Price: Range: Prices range from a minimum of $0.02 to a maximum of $1504.61, indicating a wide variety of products in terms of price. Average Purchase Value: On average, items cost about $78.51, but the median value of $39.33 suggests a right-skewed distribution (more low-priced items). Date (date, year, month, day_of_week, hour_of_day): These fields indicate when purchases were made. The latest purchase in the data is on 2020-07-14. The presence of year, month, day, and hour fields suggests potential for detailed time-based analysis, like determining peak purchase times. Total Purchases and Number of Orders: The average number of purchases per user is around 3.274, with a maximum of 48. This indicates varying levels of engagement among customers. The average number of orders is 5.413, suggesting some orders contain multiple items. Total Spending and Average Spending Per Order: Reflects the total and average expenditure per order. With an average spending of about $444.40 but a median of only $196.74, the data might be right-skewed, indicating that while most customers spend less, a few spend significantly more. Days Since Last Purchase: Indicates customer engagement. The mean of about 42.49 days suggests a moderate frequency of purchases in this cluster. Category and Brand Frequencies (category_code_freq, brand_freq): These might represent how often certain categories or brands are purchased within this cluster. Higher frequency values could indicate popular categories or brands.
# Calculate Recency, Frequency, and Monetary values
# NOTE(review): Recency here compares each user's own max(date) with their
# max(recent_purchase), which yields near-zero/negative day counts (see the
# output below, e.g. -0.739) rather than "days since last purchase" against
# a fixed reference date — confirm the intended definition.
# NOTE(review): total_spending is a per-user constant repeated on every
# transaction row, so sum() scales it by the user's row count; first() or
# mean() may be what was intended for Monetary.
rfm_table <- smallest_cluster %>%
group_by(user_id) %>%
summarize(Recency = as.numeric(difftime(max(date), max(recent_purchase), units = "days")),
Frequency = n(),
Monetary = sum(total_spending))
# Score each metric (Here, using quantiles for scoring)
# ntile() splits each metric into 5 equal-sized rank buckets (1 = lowest).
rfm_table <- rfm_table %>%
mutate(R_Score = ntile(Recency, 5),
F_Score = ntile(Frequency, 5),
M_Score = ntile(Monetary, 5))
# Create a combined RFM score and segment customers
# Higher scores are better, so we might invert the R score
# (6 - R_Score) flips recency so that more recent buyers score higher.
rfm_table <- rfm_table %>%
mutate(RFM_Score = (6 - R_Score) + F_Score + M_Score,
Segment = case_when(
RFM_Score >= 12 ~ 'Top Customers',
RFM_Score >= 9 ~ 'High Value',
RFM_Score >= 6 ~ 'Medium Value',
TRUE ~ 'Low Value'
))
head(rfm_table)
## # A tibble: 6 × 9
## user_id Recency Frequency Monetary R_Score F_Score M_Score RFM_Score Segment
## <dbl> <dbl> <int> <dbl> <int> <int> <int> <dbl> <chr>
## 1 1.52e18 -0.739 1 856. 1 1 2 8 Medium V…
## 2 1.52e18 -0.204 1 729. 5 1 1 3 Low Value
## 3 1.52e18 -0.430 1 926. 3 1 2 6 Medium V…
## 4 1.52e18 -0.592 1 880. 2 1 2 7 Medium V…
## 5 1.52e18 -0.509 1 856. 2 1 2 7 Medium V…
## 6 1.52e18 -0.617 5 17511. 2 5 5 14 Top Cust…
Top Customers (RFM Score >= 12): These are our most valuable customers. They have purchased recently, purchase frequently, and spent the most. Strategies: Focus on loyalty programs, upselling, and cross-selling to maintain their engagement. High Value (RFM Score >= 9 but < 12): Customers in this segment are valuable but might not score high in all three RFM metrics. They could be frequent buyers who spend less per purchase or recent customers who haven’t shopped frequently yet. Strategies: Encourage increased spending or more frequent purchases through targeted offers or promotions. Medium Value (RFM Score >= 6 but < 9): These customers are moderately engaged. They might be occasional shoppers or have made a few large purchases. Strategies: Engage with re-marketing campaigns, personalized communications, and offers to increase their purchase frequency or value. Low Value (RFM Score < 6): Customers with the lowest engagement, either due to infrequent purchases, low spending, or older purchase history. They might be one-time buyers or infrequent shoppers. Strategies: Reactivation campaigns, special offers to bring them back, or feedback surveys to understand their low engagement.
# Mean line-item price within this cluster. Note the 78.50 cut-off below is
# roughly the OVERALL dataset average price, not this cluster's (~955.86).
mean(smallest_cluster$price)
## [1] 955.8577
# Binary flag: 1 if the line item costs more than the global average price
smallest_cluster$avg_spend <- ifelse(smallest_cluster$price > 78.50, 1, 0)
# 70/30 train/test split of this cluster's rows.
# seq_len(nrow(x)) replaces the fragile 1:dim(x)[1] idiom; with the same
# seed, sample() draws the identical index set.
splitIndex <- sample(seq_len(nrow(smallest_cluster)), nrow(smallest_cluster) * 0.7)
train_cluster1 <- smallest_cluster[splitIndex, ]
test_cluster1 <- smallest_cluster[-splitIndex, ]
#Dataset is 'smallest_cluster' and contains 'order_id' and 'product_id'
#Convert the dataset to a suitable format for the Apriori algorithm
# Creating a basket format dataset
# Converting the list to a transaction class
# The column ordering printed below is what any positional indexing of
# smallest_cluster relies on — keep it in sync if columns change.
names(smallest_cluster)
## [1] "user_id" "event_time"
## [3] "order_id" "product_id"
## [5] "category_id" "category_code"
## [7] "brand" "price"
## [9] "date" "year"
## [11] "month" "day_of_week"
## [13] "hour_of_day" "total_purchases"
## [15] "average_purchase_value" "recent_purchase"
## [17] "days_since_last_purchase" "category_code_freq"
## [19] "brand_freq" "total_spending"
## [21] "average_spending_per_order" "number_of_orders"
## [23] "cluster" "avg_spend"
# Select the market-basket columns by NAME — robust to column reordering,
# unlike the positional c(5, 6, 7, 13) indexing used previously (these are
# the same four columns per the names() listing above).
mba <- smallest_cluster[, c("category_id", "category_code", "brand", "hour_of_day")]
head(mba)
## category_id category_code brand hour_of_day
## 1 2.268105e+18 electronics.smartphone apple 17
## 2 2.374499e+18 electronics.video.tv tcl 04
## 3 2.268105e+18 electronics.smartphone apple 10
## 4 2.268105e+18 computers.notebook asus 14
## 5 2.268105e+18 electronics.smartphone apple 12
## 6 2.374499e+18 electronics.video.tv samsung 14
# Coerce to arules 'transactions': one row = one transaction. Non-factor
# columns are auto-discretized (see warning), which is why the mined rules
# below refer to category_id intervals rather than raw ids.
trans <- as(mba, "transactions")
## Warning: Column(s) 1, 2, 3, 4 not logical or factor. Applying default
## discretization (see '? discretizeDF').
# Now we can proceed with finding frequent itemsets and generating rules
# Thresholds: itemsets in >= 10% of transactions, rules with >= 20% confidence.
rules <- apriori(trans, parameter = list(supp = 0.10, conf = 0.2), target = "rules")
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.2 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 168
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[189 item(s), 1687 transaction(s)] done [0.00s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [23 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Viewing results
# Order rules from most to least confident, then show the strongest six.
rules.sorted = sort(rules, by = "confidence")
inspect(head(rules.sorted))
## lhs rhs support confidence coverage lift count
## [1] {category_code=electronics.smartphone,
## brand=samsung} => {category_id=[2.26810543e+18,2.37449891e+18]} 0.1849437 1.0000000 0.1849437 1.653922 312
## [2] {category_code=electronics.smartphone,
## brand=apple} => {category_id=[2.26810543e+18,2.37449891e+18]} 0.2614108 1.0000000 0.2614108 1.653922 441
## [3] {category_code=electronics.smartphone} => {category_id=[2.26810543e+18,2.37449891e+18]} 0.4611737 0.9987163 0.4617664 1.651798 778
## [4] {category_code=computers.notebook} => {category_id=[2.26810539e+18,2.26810541e+18)} 0.1754594 0.9899666 0.1772377 3.058743 296
## [5] {category_id=[2.26810543e+18,2.37449891e+18],
## brand=apple} => {category_code=electronics.smartphone} 0.2614108 0.9887892 0.2643746 2.141319 441
## [6] {category_id=[2.26810543e+18,2.37449891e+18],
## brand=samsung} => {category_code=electronics.smartphone} 0.1849437 0.8547945 0.2163604 1.851140 312
# Overview of rule-length distribution and quality-measure ranges.
summary(rules.sorted)
## set of 23 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3
## 5 12 6
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.043 2.500 3.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.1755 Min. :0.2590 Min. :0.1772 Min. :1.000
## 1st Qu.:0.1849 1st Qu.:0.4191 1st Qu.:0.2629 1st Qu.:1.303
## Median :0.2614 Median :0.6046 Median :0.4612 Median :1.548
## Mean :0.2805 Mean :0.6508 Mean :0.5104 Mean :1.600
## 3rd Qu.:0.2940 3rd Qu.:0.8450 3rd Qu.:0.6046 3rd Qu.:1.687
## Max. :0.6046 Max. :1.0000 Max. :1.0000 Max. :3.059
## count
## Min. : 296.0
## 1st Qu.: 312.0
## Median : 441.0
## Mean : 473.2
## 3rd Qu.: 496.0
## Max. :1020.0
##
## mining info:
## data ntransactions support confidence
## trans 1687 0.1 0.2
## call
## apriori(data = trans, parameter = list(supp = 0.1, conf = 0.2), target = "rules")
Rule [1]: {category_code=electronics.smartphone} => {price=[74,1.5e+03]} Interpretation: Customers who buy electronics, specifically smartphones, are likely to spend between $74 and $1500. This rule suggests a common price range for smartphones in our dataset. Rule [2]: {price=[20.8,74)} => {total_purchases=[1,3)} Interpretation: Transactions where the price of items is between $20.8 and $74 are typically associated with customers who have made between 1 and 3 total purchases. This might indicate a purchasing pattern or customer segment that buys mid-priced items and has a relatively low frequency of purchases. Rule [3]: {} => {total_purchases=[1,3)}` Interpretation: This rule is a bit unusual as it has an empty left-hand side (LHS). It might suggest that, generally, most customers in our dataset fall into the category of making 1 to 3 total purchases, regardless of other factors. Rule [4]: {price=[74,1.5e+03]} => {total_purchases=[1,3)} Interpretation: Customers who purchase items in the price range of $74 to $1500 tend to have a total purchase count of between 1 and 3. This could imply that higher-priced items are bought infrequently. Rule [5]: {price=[0.02,20.8)} => {total_purchases=[1,3)} Interpretation: Transactions with item prices ranging from $0.02 to $20.8 are associated with customers having 1 to 3 total purchases. This suggests that lower-priced items are also bought by customers who do not purchase very frequently. Rule [6]: {category_code=Generic} => {total_purchases=[1,3)} Interpretation: Customers buying items from a generic category (or a category not specifically defined) tend to have a total purchase count of between 1 and 3. This might indicate a general buying pattern among customers purchasing generic items.
# Convert recent purchases to Date in a single pass (the original applied
# as.Date() a second time, which was a no-op and has been removed).
dataset$recent_purchase <- as.Date(dataset$recent_purchase, format = "%Y-%m-%d")
# Aggregate data for RFM analysis: per user, last purchase day (as numeric),
# distinct-order count, and total spend.
rfm_data <- dataset %>%
  group_by(user_id) %>%
  summarize(
    Recency = as.numeric(max(recent_purchase)),
    Frequency = n_distinct(order_id),
    Monetary = sum(price, na.rm = TRUE)
  )
# Reference date for recency: one day after the latest purchase in the data
reference_date <- max(rfm_data$Recency) + 1
# Recency becomes "days since last purchase" (always >= 1)
rfm_data$Recency <- reference_date - rfm_data$Recency
# Function to create custom segments for Frequency.
# Mapping: 1 purchase -> 1, 2 -> 2, 3 to 5 -> 3, anything else (6+) -> 4.
# Rewritten with nested ifelse() so it is vectorized (accepts whole columns)
# while remaining element-wise compatible with a sapply() caller.
frequency_score <- function(x) {
  ifelse(x == 1, 1,
    ifelse(x == 2, 2,
      ifelse(x >= 3 & x <= 5, 3, 4)
    )
  )
}
# Apply custom segmentation to Frequency
rfm_data$Frequency_Score <- sapply(rfm_data$Frequency, frequency_score)
# Segment Recency and Monetary using quantiles
# NOTE(review): higher Recency_Score here means LONGER since last purchase
# (ntile is ascending); unlike the earlier cluster-1 chunk, the R score is
# not inverted before being combined into the segment string — confirm.
rfm_data$Recency_Score <- ntile(rfm_data$Recency, 4)
rfm_data$Monetary_Score <- ntile(rfm_data$Monetary, 4)
# Combine scores into a single string
rfm_data$RFM_Segment <- paste0(rfm_data$Recency_Score, rfm_data$Frequency_Score, rfm_data$Monetary_Score)
# Create a table of counts for each RFM segment
# NOTE(review): this reassigns rfm_table, which previously held the
# cluster-1 RFM tibble — the name reuse is a latent hazard.
rfm_table <- table(rfm_data$RFM_Segment)
# Convert the table to a dataframe for plotting
rfm_score_counts <- as.data.frame(rfm_table)
names(rfm_score_counts) <- c("RFM_Segment", "Count")
# Melt the data for use with ggplot2
# NOTE(review): melting a two-column frame yields a single 'variable' level,
# so the "heatmap" below is one row tall — a plain bar chart of Count per
# segment would convey the same information.
rfm_melted <- melt(rfm_score_counts, id.vars = "RFM_Segment", value.name = "Count")
# Plot the heatmap with improved visualization
ggplot(rfm_melted, aes(x = RFM_Segment, y = variable, fill = Count)) +
geom_tile(color = "white") +
scale_fill_gradientn(colors = c("blue", "green", "yellow", "red"),
values = scales::rescale(c(0, 0.5, 0.75, 1))) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.text.y = element_blank(),
plot.title = element_text(hjust = 0.5)) +
labs(title = "Enhanced Heatmap of Customer Distribution Across RFM Scores",
x = "RFM Segment",
y = "",
fill = "Count") +
guides(fill = guide_colorbar(title.position = "top", title.hjust = 0.5))
# Recency, Frequency and Monetary values of Unique Users
# Tally how many customers fall into each RFM segment string.
rfm_counts <- rfm_data %>%
  group_by(RFM_Segment) %>%
  summarise(Count = n())

# Bar chart with segments ordered from largest to smallest population
ggplot(rfm_counts, aes(x = reorder(RFM_Segment, -Count), y = Count, fill = RFM_Segment)) +
  geom_col() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"  # fill only decorates bars; legend is redundant
  ) +
  labs(
    title = "Customer Distribution Across RFM Segments",
    x = "RFM Segment",
    y = "Number of Customers"
  )
names(dataset_with_clusters)
## [1] "user_id" "event_time"
## [3] "order_id" "product_id"
## [5] "category_id" "category_code"
## [7] "brand" "price"
## [9] "date" "year"
## [11] "month" "day_of_week"
## [13] "hour_of_day" "total_purchases"
## [15] "average_purchase_value" "recent_purchase"
## [17] "days_since_last_purchase" "category_code_freq"
## [19] "brand_freq" "total_spending"
## [21] "average_spending_per_order" "number_of_orders"
## [23] "cluster"
# Filtering for Apple and Samsung Brands
# Daily purchase counts per brand; summarize() leaves the result grouped by
# date (hence the message below) — harmless for the plot that follows.
apple_samsung_data <- dataset_with_clusters %>%
filter(brand %in% c("apple", "samsung")) %>%
group_by(date, brand) %>%
summarize(DailyCount = n())
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
head(apple_samsung_data)
## # A tibble: 6 × 3
## # Groups: date [3]
## date brand DailyCount
## <date> <chr> <int>
## 1 2020-01-05 apple 27
## 2 2020-01-05 samsung 71
## 3 2020-01-06 apple 20
## 4 2020-01-06 samsung 80
## 5 2020-01-07 apple 14
## 6 2020-01-07 samsung 95
# Line chart of the two brands' daily counts. The date column already prints
# as <date> above, so as.Date() here is a defensive no-op coercion.
ggplot(apple_samsung_data, aes(x = as.Date(date), y = DailyCount, color = brand)) +
geom_line() +
labs(title = "Daily Purchases for Apple vs Samsung Over Time", x = "Date", y = "Daily Purchase Count") +
theme_minimal()
Comparative Analysis of Brand Performance Over Time
The line graph illustrates the daily purchase counts for Apple and Samsung products, spanning from January to July. Observations are as follows:
The blue line indicates Samsung’s daily purchases, which consistently surpass Apple’s, represented by the red line. This suggests a stronger daily sales performance from Samsung across the observed period. Apple’s purchase counts are generally lower but experience occasional sharp increases, hinting at sporadic days with significantly higher sales. Samsung shows greater variability with several peaks, most notably a sharp increase as we approach July. This could indicate special sales events or new product releases. The comparative dynamics between the two brands reveal that while Samsung maintains a lead in daily purchases, Apple’s peaks suggest successful sales drives or product launches that temporarily boost its numbers. This graph offers insights into the purchasing patterns of consumers for these two brands, with implications for marketing strategies, inventory planning, and understanding market trends.
# Preparing Data for Cluster Distribution Plot
apple_samsung_cluster_data <- dataset_with_clusters %>%
filter(brand %in% c("apple", "samsung")) %>%
group_by(brand, cluster) %>%
summarize(Count = n())
## `summarise()` has grouped output by 'brand'. You can override using the
## `.groups` argument.
# Cluster Distribution Plot
ggplot(apple_samsung_cluster_data, aes(x = brand, y = Count, fill = as.factor(cluster))) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(title = "Cluster Distribution for Apple and Samsung", x = "Brand", y = "Count") +
theme_minimal() +
scale_fill_discrete(name = "Cluster")
# Cluster Analysis
## Time Series Analysis of Cluster Purchases
# To see which cluster has more purchases over time, build a per-day,
# per-cluster purchase count for the time-series plot below.
# `.groups = "drop_last"` makes the summarise() regrouping explicit (result
# stays grouped by `date`, matching the default) and silences the
# "has grouped output" message seen in the rendered output.
cluster_time_series <- dataset_with_clusters %>%
  group_by(date, cluster) %>%
  summarize(DailyPurchases = n(), .groups = "drop_last")
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
head(cluster_time_series)
## # A tibble: 6 × 3
## # Groups: date [2]
## date cluster DailyPurchases
## <date> <chr> <int>
## 1 2020-01-05 1 1
## 2 2020-01-05 2 159
## 3 2020-01-05 3 357
## 4 2020-01-05 4 103
## 5 2020-01-06 1 3
## 6 2020-01-06 2 31
# One line per cluster: daily purchase volume over the observed period.
ggplot(
  cluster_time_series,
  aes(x = as.Date(date), y = DailyPurchases, color = as.factor(cluster))
) +
  geom_line() +
  scale_color_discrete(name = "Cluster") +
  labs(
    title = "Daily Purchases by Cluster Over Time",
    x = "Date",
    y = "Daily Purchases"
  ) +
  theme_minimal()
# Check for missing values and inspect per-column ranges before cleaning;
# summary() lists an NA count for any column that contains missing values.
summary(dataset_with_clusters)
## user_id event_time order_id
## Min. :1.516e+18 Min. :2020-01-05 04:01:46.00 Min. :2.294e+18
## 1st Qu.:1.516e+18 1st Qu.:2020-02-19 06:58:02.25 1st Qu.:2.336e+18
## Median :1.516e+18 Median :2020-04-09 04:19:48.00 Median :2.349e+18
## Mean :1.516e+18 Mean :2020-04-13 04:30:01.48 Mean :2.340e+18
## 3rd Qu.:1.516e+18 3rd Qu.:2020-06-05 10:39:16.25 3rd Qu.:2.349e+18
## Max. :1.516e+18 Max. :2020-07-14 16:54:28.00 Max. :2.353e+18
## product_id category_id category_code brand
## Min. :1.516e+18 Min. :2.268e+18 Length:105308 Length:105308
## 1st Qu.:1.516e+18 1st Qu.:2.268e+18 Class :character Class :character
## Median :1.516e+18 Median :2.268e+18 Mode :character Mode :character
## Mean :1.685e+18 Mean :2.273e+18
## 3rd Qu.:1.516e+18 3rd Qu.:2.268e+18
## Max. :2.349e+18 Max. :2.374e+18
## price date year month
## Min. : 0.02 Min. :2020-01-05 Length:105308 Length:105308
## 1st Qu.: 16.18 1st Qu.:2020-02-19 Class :character Class :character
## Median : 50.90 Median :2020-04-09 Mode :character Mode :character
## Mean : 151.00 Mean :2020-04-12
## 3rd Qu.: 178.22 3rd Qu.:2020-06-05
## Max. :18328.68 Max. :2020-07-14
## day_of_week hour_of_day total_purchases average_purchase_value
## Length:105308 Length:105308 Min. : 1.00 Min. : 0.02
## Class :character Class :character 1st Qu.: 2.00 1st Qu.: 81.75
## Mode :character Mode :character Median : 5.00 Median : 121.10
## Mean : 76.04 Mean : 151.00
## 3rd Qu.:144.00 3rd Qu.: 167.53
## Max. :466.00 Max. :6215.25
## recent_purchase days_since_last_purchase category_code_freq
## Min. :2020-01-05 05:49:53.00 Min. : 0.000 Min. :0.0000095
## 1st Qu.:2020-05-08 06:47:21.00 1st Qu.: 5.477 1st Qu.:0.0135127
## Median :2020-06-01 07:52:12.50 Median : 43.377 Median :0.0275288
## Mean :2020-06-01 09:35:51.74 Mean : 43.305 Mean :0.0975819
## 3rd Qu.:2020-07-09 05:27:18.50 3rd Qu.: 67.422 3rd Qu.:0.2670357
## Max. :2020-07-14 16:54:28.00 Max. :191.462 Max. :0.2670357
## brand_freq total_spending average_spending_per_order
## Min. :0.0000095 Min. : 0.02 Min. : 0.02
## 1st Qu.:0.0046530 1st Qu.: 259.21 1st Qu.: 81.75
## Median :0.0131899 Median : 1517.22 Median : 121.10
## Mean :0.0343295 Mean :14901.18 Mean : 151.00
## 3rd Qu.:0.0355339 3rd Qu.:28756.89 3rd Qu.: 167.53
## Max. :0.1423349 Max. :83492.18 Max. :6215.25
## number_of_orders cluster
## Min. : 1.0 Length:105308
## 1st Qu.: 3.0 Class :character
## Median : 9.0 Mode :character
## Mean :115.3
## 3rd Qu.:226.0
## Max. :704.0
# Remove rows containing any missing values before user-level aggregation.
dataset_with_clusters <- na.omit(dataset_with_clusters)

# Cutoff date: only purchases after this date count toward the target.
reference_date <- as.Date("2020-02-01")

# One row per user: TRUE if the user bought at least one smartphone
# (case-insensitive match on category_code) after the cutoff date.
user_aggregated_data <- dataset_with_clusters %>%
  group_by(user_id) %>%
  summarize(
    purchased_smartphone = any(
      date > reference_date &
        grepl("smartphone", category_code, ignore.case = TRUE)
    )
  )
# Distribution of the target variable: how many users did / did not make a
# post-cutoff smartphone purchase.
table(user_aggregated_data[["purchased_smartphone"]])
##
## FALSE TRUE
## 20295 6569
Since February 1, 2020, 6,569 users have purchased at least one smartphone, while 20,295 users have not made any smartphone purchases. This information is crucial for understanding the user behavior in our dataset and will be the foundation for any predictive modeling we plan to do. It also highlights the class imbalance in our target variable, which is an important aspect to consider in our modeling strategy.
# Cutoff date (re-declared for this chunk).
reference_date <- as.Date("2020-02-01")

# Average item price among post-cutoff smartphone purchases; used below as
# the benchmark for flagging high-spending non-buyers.
mean_spending_smartphone_purchasers <- dataset_with_clusters %>%
  filter(
    date > reference_date,
    grepl("smartphone", category_code, ignore.case = TRUE)
  ) %>%
  summarize(mean_price = mean(price, na.rm = TRUE)) %>%
  pull(mean_price)
# Aggregate to one row per user: mean item price and a smartphone-purchase
# indicator, then flag non-buyers whose mean spending exceeds the
# smartphone buyers' average price.
user_aggregated_data <- dataset_with_clusters %>%
  group_by(user_id) %>%
  summarize(
    mean_spending = mean(price, na.rm = TRUE),
    purchased_smartphone = any(
      date > reference_date &
        grepl("smartphone", category_code, ignore.case = TRUE)
    )
  ) %>%
  mutate(
    # as.numeric(logical) yields the same 1/0 coding as ifelse(cond, 1, 0)
    higher_spending_non_buyer = as.numeric(
      !purchased_smartphone &
        mean_spending > mean_spending_smartphone_purchasers
    )
  )
# Check the distribution of the new variable: 1 = non-buyer whose mean
# spending exceeds the smartphone buyers' average price, 0 = everyone else
table(user_aggregated_data$higher_spending_non_buyer)
##
## 0 1
## 24640 2224
summary(user_aggregated_data)
## user_id mean_spending purchased_smartphone
## Min. :1.516e+18 Min. : 0.02 Mode :logical
## 1st Qu.:1.516e+18 1st Qu.: 33.54 FALSE:20295
## Median :1.516e+18 Median : 97.91 TRUE :6569
## Mean :1.516e+18 Mean : 170.59
## 3rd Qu.:1.516e+18 3rd Qu.: 215.14
## Max. :1.516e+18 Max. :6215.25
## higher_spending_non_buyer
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.08279
## 3rd Qu.:0.00000
## Max. :1.00000
# Table of frequencies stored as 'frequency_table'.
# Derived from the data instead of hard-coded: the previous literal
# c(24636, 2224) disagreed with the computed distribution shown by
# table(user_aggregated_data$higher_spending_non_buyer) (24640 vs 2224).
# count() sorts the indicator ascending, so rows are 0 then 1 as before.
frequency_table <- user_aggregated_data %>%
  count(higher_spending_non_buyer, name = "count") %>%
  as.data.frame()
# Create the bar chart with color: one bar per indicator level, with a
# manual blue/red fill scale and descriptive legend labels.
ggplot(
  frequency_table,
  aes(
    x = as.factor(higher_spending_non_buyer),
    y = count,
    fill = as.factor(higher_spending_non_buyer)
  )
) +
  geom_col() +
  scale_fill_manual(
    values = c("blue", "red"),
    labels = c("Avg less than smartphone buyers", "Avg higher than smartphone buyers")
  ) +
  labs(
    x = "Higher Spending Non-Buyer",
    y = "Number of Users",
    title = "Non Smartphone buyers with an average spending more than smartphone buyers",
    fill = "Category"
  ) +
  theme_minimal()
# Define the number of top spenders we want to display
top_n_spenders <- 10

# Keep only flagged non-buyers, order by descending mean spending, and
# retain the first N rows (head() on fewer than N rows returns them all,
# matching slice(1:N)).
higher_spending_non_buyers <- user_aggregated_data %>%
  filter(higher_spending_non_buyer == 1) %>%
  arrange(desc(mean_spending)) %>%
  head(top_n_spenders)
# Visualize the top N higher spending non-buyers: one bar per user,
# ordered by mean spending; user IDs are converted to character so each
# gets a discrete position and fill.
ggplot(
  higher_spending_non_buyers,
  aes(
    x = reorder(as.character(user_id), mean_spending),
    y = mean_spending,
    fill = as.character(user_id)
  )
) +
  geom_col() +
  labs(
    x = "User ID",
    y = "Mean Spending",
    title = paste("Top", top_n_spenders, "Non smartphone buyers with high spending")
  ) +
  theme_minimal() +
  theme(
    # Rotate x-axis labels for readability; hide the redundant legend title
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.title = element_blank()
  )